Data Science 112 Final Project: Data Extraction
Tricks for Tips: A Data-Driven Analysis of Tipping Influences
by Amelie and Spurti
This project explores variables that impact the tipping percentages.
This section includes visualizations and summary statistics that helped us identify temporal, spatial, and contextual trends in tipping behavior. Our goal was to use visual and statistical cues to refine our understanding of the collected data and inform our future analysis and predictive model development.
import pandas as pd
import plotly.express as px
import plotly.io as pio
from google.colab import drive

# Select plotly's 'notebook' renderer for every fig.show() in this notebook.
pio.renderers.default = 'notebook'

# Load the trip data from Google Drive.
drive.mount('/content/drive')
df = pd.read_csv("/content/drive/My Drive/data.csv")

# Derived columns that the later cells also read.
df['pickup_hour'] = pd.to_datetime(df['pickup_time'], format="%H:%M:%S").dt.hour
df['tip_percent'] = (df['tip_amount'] / df['total_amount']) * 100

day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
hours = list(range(24))

# Mean tip % for every (day, hour) pair that occurs in the data.
agg = (
    df
    .groupby(['day_of_week', 'pickup_hour'])['tip_percent']
    .mean()
    .reset_index()
)

# Reindex to the full 7 x 24 grid: rows follow day_order and any hour or day
# without trips becomes a NaN cell. Without this, the matrix shape can differ
# from the explicit x/y labels passed to px.imshow, which raises an error.
heatmap_matrix = (
    agg
    .pivot(index='day_of_week', columns='pickup_hour', values='tip_percent')
    .reindex(index=day_order, columns=hours)
)

fig = px.imshow(
    heatmap_matrix,
    labels={
        'x': 'Hour of Day',
        'y': 'Day of Week',
        'color': 'Avg Tip %'
    },
    x=hours,
    y=day_order,
    color_continuous_scale='ice',
    aspect='auto',
    title="Heatmap of Average Tip % by Hour and Day of Week"
)
fig.update_xaxes(dtick=1)  # one tick per hour
fig.update_layout(
    yaxis=dict(autorange='reversed'),
    height=400, width=1200
)
fig.show()
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
import plotly.graph_objects as go

# Mean tip % for each hour 0-23; hours with no rows are filled with 0.
hourly_tip = (
    df.groupby('pickup_hour')['tip_percent']
    .mean()
    .reindex(range(24), fill_value=0)
    .reset_index()
)
hourly_tip['label'] = hourly_tip['pickup_hour'].astype(str) + ":00"

# Main trace: one point per hour around the polar axis.
tip_trace = go.Scatterpolar(
    r=hourly_tip['tip_percent'],
    theta=hourly_tip['label'],
    mode='lines+markers',
    line=dict(color="#2a9d8f", width=1),
    marker=dict(color="#2a9d8f", size=5),
    name='Avg Tip %'
)
fig = go.Figure()
fig.add_trace(tip_trace)

# (first hour, last hour, fill colour, legend name) of each shaded wedge.
highlight_sectors = [
    (12, 14, "rgba(255, 165, 0, 0.15)", "Lunch"),
    (18, 21, "rgba(100, 149, 237, 0.15)", "Dinner")
]
for first_hour, last_hour, fill, sector_name in highlight_sectors:
    hour_labels = [f"{h}:00" for h in range(first_hour, last_hour + 1)]
    n_hours = len(hour_labels)
    # Closed outline: outer edge at r=25 going forward through the hours,
    # then back along r=0 in reverse so 'toself' fills the wedge.
    wedge = go.Scatterpolar(
        r=[25] * n_hours + [0] * n_hours,
        theta=hour_labels + hour_labels[::-1],
        fill='toself',
        fillcolor=fill,
        line=dict(width=0),
        name=sector_name,
    )
    fig.add_trace(wedge)

fig.update_layout(
    title="Radial Plot of Avg Tip % by Hour",
    height=650,
    width=650,
    showlegend=True,
    polar=dict(
        angularaxis=dict(direction="clockwise", rotation=90)
    )
)
fig.show()
Analysis: We can see a peak in tipping during meal times, especially at dinner time. Tipping also peaks from 7–8 am, which is presumably the common time for adults to commute to work.
# Parse the clock times into local Series instead of overwriting the raw
# columns. The parsed values carry a placeholder date, and a later cell
# rebuilds a full timestamp from pickup_date + pickup_time, which only works
# while pickup_time is still a plain "HH:MM:SS" string.
pickup_clock = pd.to_datetime(df['pickup_time'], format="%H:%M:%S")
dropoff_clock = pd.to_datetime(df['dropoff_time'], format="%H:%M:%S")

df['trip_duration_min'] = (dropoff_clock - pickup_clock).dt.total_seconds() / 60
# A negative duration means the trip crossed midnight; wrap it by one day.
df.loc[df['trip_duration_min'] < 0, 'trip_duration_min'] += 24 * 60

# Keep trips of at most two hours. The explicit copy makes df an independent
# frame, so the column assignments in later cells do not act on a slice.
df = df[df['trip_duration_min'] <= 120].copy()

fig = px.scatter(
    df,
    x='trip_duration_min',
    y='tip_percent',
    title='Tip Percentage vs. Trip Duration',
    labels={'trip_duration_min': 'Trip Duration (minutes)', 'tip_percent': 'Tip Percentage'},
    opacity=0.2,
    marginal_x='histogram',
    marginal_y='histogram',
    trendline='lowess',
    trendline_color_override="black"
)
fig.update_layout(
    height=550,
    width=800,
)
# Render explicitly, like every other cell, rather than relying on the
# notebook echoing the last expression.
fig.show()
Analysis: The LOWESS trendline indicates that the duration of a trip does not necessarily affect tip percentage.
# Left map data: per-ZIP means (tip %, temperature, pickup coordinates),
# computed only over trips with a positive tip.
tipped = df[df["tip_percent"] > 0]
zip_weather = (
    tipped
    .groupby("pickup_zipcode")[["tip_percent", "temperature_2m", "pickup_lat", "pickup_lng"]]
    .mean()
    .reset_index()
)

# Right map data: mean tip % at each distinct pickup coordinate (all trips).
loc_agg = (
    df.groupby(["pickup_lat", "pickup_lng"])["tip_percent"]
    .mean()
    .rename("avg_tip")
    .reset_index()
)

# Marker sizes scale with tip % relative to the maximum, from 5 up to 30.
sizes_zip = zip_weather["tip_percent"] / zip_weather["tip_percent"].max() * 25 + 5
sizes_loc = loc_agg["avg_tip"] / loc_agg["avg_tip"].max() * 25 + 5

zip_trace = go.Scattermapbox(
    lat=zip_weather["pickup_lat"],
    lon=zip_weather["pickup_lng"],
    mode="markers",
    marker=dict(
        size=sizes_zip,
        color=zip_weather["temperature_2m"],
        colorscale="Turbo",
        cmin=zip_weather["temperature_2m"].min(),
        cmax=zip_weather["temperature_2m"].max(),
        colorbar=dict(title="Avg Temp", len=0.4, y=0.75)
    ),
    # customdata columns: [0] ZIP code, [1] mean tip %.
    customdata=zip_weather[["pickup_zipcode", "tip_percent"]].to_numpy(),
    hovertemplate=(
        "ZIP: %{customdata[0]}<br>"
        "Avg Tip: %{customdata[1]:.1f}%<br>"
        "Temp: %{marker.color:.1f}°C<extra></extra>"
    ),
    name="ZIP Weather+Tip",
    subplot="mapbox1"
)

loc_trace = go.Scattermapbox(
    lat=loc_agg["pickup_lat"],
    lon=loc_agg["pickup_lng"],
    mode="markers",
    marker=dict(
        size=sizes_loc,
        color=loc_agg["avg_tip"],
        colorscale="haline",
        cmin=0,
        cmax=loc_agg["avg_tip"].max(),
        colorbar=dict(title="Avg Tip %", len=0.4, y=0.25)
    ),
    hovertemplate="Avg Tip: %{marker.color:.1f}%<extra></extra>",
    name="Loc Avg Tip",
    subplot="mapbox2"
)

fig = go.Figure()
fig.add_trace(zip_trace)
fig.add_trace(loc_trace)

# Two side-by-side maps, each centred on the mean coordinate of its points.
left_map = dict(
    domain={"x": [0, 0.49], "y": [0, 1]},
    style="carto-positron",
    center=dict(
        lat=zip_weather["pickup_lat"].mean(),
        lon=zip_weather["pickup_lng"].mean()
    ),
    zoom=9
)
right_map = dict(
    domain={"x": [0.51, 0.99], "y": [0, 1]},
    style="carto-positron",
    center=dict(
        lat=loc_agg["pickup_lat"].mean(),
        lon=loc_agg["pickup_lng"].mean()
    ),
    zoom=9
)
fig.update_layout(
    mapbox1=left_map,
    mapbox2=right_map,
    height=600,
    width=1000,
    showlegend=False,
    title_text="ZIP Temp & Tip | Avg Tip by Location"
)
fig.show()
Analysis: There is a positive correlation between tipping and temperature. Areas with higher temperatures, such as Manhattan, show much more tipping.
import numpy as np
from plotly.subplots import make_subplots  # used below but never imported before

# Build a full pickup timestamp from the date and clock-time columns.
# pickup_time may already hold parsed datetimes (with a placeholder date);
# reduce those to "HH:MM:SS" first so the concatenated string stays parseable.
pickup_clock = df['pickup_time']
if pd.api.types.is_datetime64_any_dtype(pickup_clock):
    pickup_clock = pickup_clock.dt.strftime('%H:%M:%S')
df['pickup_datetime'] = pd.to_datetime(
    df['pickup_date'].astype(str) + ' ' + pickup_clock.astype(str),
    errors='coerce'  # unparseable rows become NaT
)
df.loc[:, 'month'] = df['pickup_datetime'].dt.month
# December-February is labelled 'Winter'; every other month is 'Summer'.
df.loc[:, 'season'] = np.where(df['month'].isin([12, 1, 2]), 'Winter', 'Summer')
df.loc[:, 'date'] = df['pickup_datetime'].dt.date

# One row per (calendar day, season) with the day's mean tip % and weather.
daily_avg = (
    df.groupby(['date', 'season'])
    .agg(
        tip_percent=('tip_percent', 'mean'),
        precipitation=('precipitation', 'mean'),
        wind_speed=('wind_speed_10m', 'mean'),
        snowfall_cm=('snowfall', 'mean'),
        snow_depth_cm=('snow_depth', 'mean')
    )
    .reset_index()
)

# (daily_avg column, axis label) for each of the four subplots.
factors = [
    ('precipitation', 'Precipitation (mm)'),
    ('wind_speed', 'Wind Speed (km/h)'),
    ('snowfall_cm', 'Snowfall (cm)'),
    ('snow_depth_cm', 'Snow Depth (cm)')
]
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=[label for _, label in factors],
    horizontal_spacing=0.1, vertical_spacing=0.1
)
season_colors = {'Winter': 'royalblue', 'Summer': 'crimson'}

for idx, (col_name, x_label) in enumerate(factors):
    # Fill the 2x2 grid row by row.
    r = idx // 2 + 1
    c = idx % 2 + 1
    for season in ['Winter', 'Summer']:
        subset = daily_avg[daily_avg['season'] == season]
        x = subset[col_name]
        y = subset['tip_percent']
        # Legend entries are emitted only by the first subplot so each
        # season appears once.
        fig.add_trace(
            go.Scatter(
                x=x, y=y,
                mode='markers',
                marker=dict(color=season_colors[season], size=6, opacity=0.5),
                name=season if idx == 0 else None,
                showlegend=(idx == 0)
            ),
            row=r, col=c
        )

        # Fit the trend only on rows where both values are finite, and only
        # when x actually varies: NaN/inf in either series, or a constant x
        # (e.g. no snowfall all season), makes a least-squares line
        # undefined or ill-conditioned.
        valid = np.isfinite(x) & np.isfinite(y)
        x_fit = x[valid]
        y_fit = y[valid]
        if x_fit.nunique() >= 2:
            m, b = np.polyfit(x_fit, y_fit, 1)
            x_line = np.linspace(x_fit.min(), x_fit.max(), 100)
            y_line = m * x_line + b
            fig.add_trace(
                go.Scatter(
                    x=x_line, y=y_line,
                    mode='lines',
                    line=dict(color=season_colors[season], width=2),
                    name=f'{season} Trend' if idx == 0 else None,
                    showlegend=(idx == 0)
                ),
                row=r, col=c
            )
    fig.update_xaxes(title_text=x_label, row=r, col=c)
    fig.update_yaxes(title_text='Tip %', ticksuffix='%', range=[0, 60], row=r, col=c)

fig.update_layout(
    title_text='Weather Conditions vs Tip Percentage by Season',
    height=800, width=1000,
)
fig.show()
<ipython-input-46-f1c2581e53ff>:3: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
We see that there is a slight positive correlation between precipitation and tipping, as well as between snowfall and tipping. Customers may tip more in harsher weather conditions because they appreciate their rides and drivers more.
# Mean tip % per (borough, zone), using only trips with a positive tip,
# ordered from the highest-tipping zone to the lowest.
tipped_trips = df[df["tip_percent"] > 0]
zone_means = tipped_trips.groupby(["pickup_borough", "pickup_zone"])["tip_percent"].mean()
display_names = {
    "pickup_borough": "Borough",
    "pickup_zone": "Zone",
    "tip_percent": "Avg Tip %"
}
pickup_avg = (
    zone_means
    .reset_index()
    .rename(columns=display_names)
    .sort_values("Avg Tip %", ascending=False)
)

# Pin the x-axis categories to the sorted order; colouring by borough would
# otherwise let plotly regroup the bars.
zone_order = pickup_avg["Zone"].tolist()

fig = px.bar(
    pickup_avg,
    x="Zone",
    y="Avg Tip %",
    color="Borough",
    color_discrete_sequence=px.colors.qualitative.Safe,
    category_orders={"Zone": zone_order},
    title="Avg Tip % by Zone (colored by Borough)",
    labels={"Avg Tip %": "Avg Tip %", "Zone": "Zone"}
)
fig.update_layout(
    height=600,
    # Extra bottom margin leaves room for the rotated zone names.
    margin=dict(t=80, l=50, r=50, b=150),
    legend_title_text="",
)
fig.update_xaxes(tickangle=-45)
fig.show()
We are able to see that locations with the highest tipping percentages are Washington Heights North, Manhattanville, Washington Heights South, and a few other places in Manhattan.
from plotly.subplots import make_subplots  # used below but never imported before

# Ten drop-off zones with the highest mean tip amount (USD).
avg_tip = (
    df
    .groupby('dropoff_zone', as_index=False)['tip_amount']
    .mean()
    .rename(columns={'tip_amount': 'avg_tip_amount'})
    .sort_values('avg_tip_amount', ascending=False)
    .head(10)
)

# Ten drop-off zones with the largest share (%) of all tip dollars.
total_tip_by_zone = df.groupby('dropoff_zone')['tip_amount'].sum()
prop = (
    (total_tip_by_zone / total_tip_by_zone.sum() * 100)
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
    .rename(columns={'tip_amount': 'tip_share_pct'})
)

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=(
        "Top 10 Avg Tip Amount by Drop-Off Zone",
        "Top 10 Tip Share by Drop-Off Zone"
    ),
    # Wider gap so the left colorbar and the right panel's zone labels
    # both fit between the two charts.
    horizontal_spacing=0.25,
)
fig.add_trace(
    go.Bar(
        x=avg_tip['avg_tip_amount'],
        y=avg_tip['dropoff_zone'],
        orientation='h',
        marker=dict(
            color=avg_tip['avg_tip_amount'],
            colorscale='purd',
            # Anchor this colorbar just right of the left panel; at the
            # default position it would sit on top of the other colorbar.
            colorbar=dict(title='Avg Tip ($)', x=0.38)
        ),
        text=avg_tip['avg_tip_amount'].map(lambda v: f"${v:.2f}"),
        textposition='auto'
    ),
    row=1, col=1
)
fig.add_trace(
    go.Bar(
        x=prop['tip_share_pct'],
        y=prop['dropoff_zone'],
        orientation='h',
        marker=dict(
            # Colour by the quantity this panel plots. The original reused
            # the average-tip values of a different set of zones here.
            color=prop['tip_share_pct'],
            colorscale='purp',
            colorbar=dict(title='Tip Share (%)')
        ),
        text=prop['tip_share_pct'].map(lambda v: f"{v:.1f}%"),
        textposition='auto'
    ),
    row=1, col=2
)
fig.update_layout(
    title_text="Top 10 Drop-Off Zones: Tip Amount & Tip Share",
    height=400, width=2050,
    showlegend=False,
)
# Reverse both y axes so the top-ranked zone is drawn at the top.
fig.update_yaxes(autorange='reversed')
fig.update_xaxes(title_text="Average Tip Amount (USD)", row=1, col=1)
fig.update_xaxes(title_text="Tip Share of Total (%)", row=1, col=2)
fig.show()
# Reload the raw data so this cell starts from the unfiltered CSV, then
# recompute the tip percentage column.
df = pd.read_csv("/content/drive/My Drive/data.csv")
df['tip_percent'] = df['tip_amount'] / df['total_amount'] * 100

# Per pickup zone: mean tip %, number of rows, and median of pickup_income.
zone_stats = (
    df
    .groupby('pickup_zone')
    .agg(
        avg_tip=('tip_percent', 'mean'),
        trip_count=('tip_percent', 'size'),
        median_income=('pickup_income', 'median')
    )
    .reset_index()
)

axis_labels = {
    'median_income': 'Median Household Income (USD)',
    'avg_tip': 'Average Tip %',
    'trip_count': 'Trip Count'
}
# Bubble chart: x = income, y = tip %, bubble area = trip count.
fig = px.scatter(
    zone_stats,
    x='median_income',
    y='avg_tip',
    size='trip_count',
    size_max=40,
    color='avg_tip',
    color_continuous_scale='earth',
    hover_name='pickup_zone',
    labels=axis_labels,
    title='Avg Tip % vs. Median Income by Pickup Zone'
)
fig.update_layout(
    # Show income ticks as dollar amounts with thousands separators.
    xaxis={'title_standoff': 10, 'tickprefix': '$', 'tickformat': ','},
    yaxis={'title_standoff': 10},
    legend_title_text='Avg Tip %',
    height=600,
    width=800
)
fig.show()
Analysis: There is a weak positive correlation between median income and average tip percentage. This means that, depending on the pickup zone, the average tip a driver can get can differ greatly.
# Columns included in the pairwise correlation matrix.
corr_cols = [
    'tip_percent',
    'trip_duration',
    'pickup_income',
    'dropoff_income',
    'passenger_count',
    'fare_amount',
    'congestion_surcharge',
    'trip_distance'
]
# DataFrame.corr() defaults to Pearson correlation.
selected = df[corr_cols]
corr_df = selected.corr()

fig = px.imshow(
    corr_df,
    text_auto='.2f',  # print each coefficient in its cell, two decimals
    color_continuous_scale='sunset',
    aspect='auto',
    labels={'x': "Variable", 'y': "Variable", 'color': "Pearson r"},
    title="Correlation Matrix of Tip%, Trip & Income"
)
fig.update_xaxes(side="bottom")
fig.update_layout(width=700, height=500)
fig.show()
Analysis: The strongest positive correlations with tip percentage are the income level of the drop-off area and the congestion surcharge. The congestion surcharge is already an added cost, so the correlation is understandable.
# @markdown Run this cell to download this notebook as a webpage, `_NOTEBOOK.html`.
import google, json, nbformat
x
# Get the current notebook and write it to _NOTEBOOK.ipynb
raw_notebook = google.colab._message.blocking_request("get_ipynb",
timeout_sec=30)["ipynb"]
with open("_NOTEBOOK.ipynb", "w", encoding="utf-8") as ipynb_file:
ipynb_file.write(json.dumps(raw_notebook))
# Use nbconvert to convert .ipynb to .html.
!jupyter nbconvert --to html --log-level WARN _NOTEBOOK.ipynb
# Download the .html file.
google.colab.files.download("_NOTEBOOK.html")